# Read data
# Per-sample mean and std coverage of the selected final bins (anvio output,
# wide format: one row per bin, one column per sample).
mean_coverage <- read.table("./anvio_output/rebin/mean_coverage_selected_final.tsv", header = TRUE)
std_coverage <- read.table("./anvio_output/rebin/std_coverage_selected_final.tsv", header = TRUE)
# Keep columns 2 and 5 of the bins summary -- downstream code joins on "bins"
# and reads "bin_size", so these must be the bin name and bin size columns
# (presumably bp; TODO confirm against the anvio summary header).
bin_size <- read.table("./anvio_output/rebin/general_bins_summary_selected_final.tsv", header = TRUE)[, c(2,5)]
# Total sequenced reads per sample; denominator of the relative-abundance
# formula applied further down.
total_reads <- read.table("./anvio_output/sample_reads.tsv", header = TRUE)
# Sequencing read length (bp) used in the relative-abundance formula
read_length <- 150
# From wide to long format
# One row per (bin, sample); the sample columns span Fa13_BD_MLB_DN through
# Su13_BD_MM15_SN_C in the wide table.
mean_coverage_long <- gather(mean_coverage, Sample_ID, coverage,
Fa13_BD_MLB_DN:Su13_BD_MM15_SN_C, factor_key=TRUE)
# Strip the "_C" suffix from sample names (this also coerces the factor
# created by factor_key=TRUE back to character). NOTE(review): gsub(x,
# pattern=, replacement=) works because pattern/replacement are named, so
# the positional argument fills `x`; the pattern is unanchored, so this
# assumes "_C" occurs only as a suffix -- confirm.
mean_coverage_long[,2] <- gsub(mean_coverage_long[,2], pattern = "_C",
replacement = "")
# Same reshape for the coverage standard deviations
std_coverage_long <- gather(std_coverage, Sample_ID, std_coverage,
Fa13_BD_MLB_DN:Su13_BD_MM15_SN_C,
factor_key=TRUE)
std_coverage_long[,2] <- gsub(std_coverage_long[,2], pattern = "_C",
replacement = "")
# Combine mean and std coverage side by side; this relies on both gather()
# calls emitting rows in the same (bin, sample) order.
coverage_data <- data.frame(mean_coverage_long,
std_coverage = std_coverage_long[,3])
# Read and add metadata
meta <- read.csv2("metadata.csv")
# Metadata sample IDs use "." separators; convert to "_" so they match the
# anvio sample names before joining.
meta$Sample_ID <- gsub(meta$Sample_ID, pattern = ".", replacement = "_", fixed = TRUE)
data_total <- left_join(coverage_data, total_reads, by = c("Sample_ID" = "sample"))
data_total <- left_join(data_total, bin_size, by = "bins")
data_total <- left_join(data_total, meta, by = "Sample_ID")
# Calculate relative abundance of the bins:
#   rel. abundance (%) = 100 * (mean coverage * bin size) /
#                              (read length * total sample reads)
# Upper/lower bounds use mean coverage +/- its standard deviation.
data_total$mean_rel_abundance <- 100*(data_total$coverage*data_total$bin_size)/(read_length*data_total$Total_reads)
data_total$upper_rel_abundance <- 100*((data_total$coverage+data_total$std_coverage)*data_total$bin_size)/(read_length*data_total$Total_reads)
data_total$lower_rel_abundance <- 100*((data_total$coverage-data_total$std_coverage)*data_total$bin_size)/(read_length*data_total$Total_reads)
# Annotated phylogenomic tree
# (Section heading from the knitted document; it was fused onto the line
# above and broke parsing.)
# read file
# ANIb percentage-identity matrix from pyani (square: genome x genome,
# values in [0, 1]).
data.ANI <- read.table("./ANI/ANIb_percentage_identity.tab")
# Replace genome names by better annotated names
map_ani <- read.delim("./ANI/pyani_ref_names.tsv", stringsAsFactors = FALSE)
# seq_len() rather than 1:nrow() so an empty mapping file is a no-op
# instead of iterating over c(1, 0).
for (i in seq_len(nrow(map_ani))) {
  colnames(data.ANI)[colnames(data.ANI) %in% map_ani$ani_file[i]] <- map_ani$ref_name[i]
  rownames(data.ANI)[rownames(data.ANI) %in% map_ani$ani_file[i]] <- map_ani$ref_name[i]
}
# Remove pnec and root genomes from dataframe: keep only MAGs and
# Limnohabitans ("Lim") genomes, applied to rows and columns alike.
data.ANI <- data.ANI[grep("MAG|Lim", rownames(data.ANI)), grep("MAG|Lim", rownames(data.ANI))]
# Order y-axis according to phylogenetic tree order
# Fixed display order for the ANI heatmap axes: the ten MAGs interleaved
# with named Limnohabitans reference strains, following the phylogenomic
# tree topology (one entry per line for easy editing).
ord_list_bin <- c(
  "MAG5.SP-M110-DD",
  "MAG2.FA-MLB-SN",
  "MAG3.FA-MLB-SN",
  "MAG4.FA-M110-DN",
  "MAG1.FA-MLB-DN",
  "MAG10.SU-M15-SN",
  "MAG6.SP-M15-SD",
  "MAG8.SU-M110-DCMD",
  "Lim. sp. Rim11",
  "Lim. sp. 103DPR2",
  "MAG7.SU-MLB-SD",
  "Lim. sp. Rim47",
  "MAG9.SU-M15-SN",
  "Lim. sp. Rim28",
  "Lim. sp. 63ED37-2",
  "Lim. sp. 2KL-27",
  "Lim. sp. 2KL-3",
  "Lim. sp. DM1",
  "Lim. sp. II-D5"
)
# order rows and columns
# Reorder the ANI matrix to follow the phylogenetic order defined above
data.ANI <- data.ANI[, order(match(colnames(data.ANI), ord_list_bin))]
data.ANI <- data.ANI[order(match(rownames(data.ANI), ord_list_bin)), ]
# get_upper_tri is a project helper (not defined in this file); presumably
# it NA-fills the lower triangle so melt(na.rm=TRUE) drops it -- confirm.
data.ANI <- get_upper_tri(data.ANI)
# melt to dataframe
df_pyani <- melt(as.matrix(data.ANI), na.rm = TRUE) # reshape into dataframe
names(df_pyani)[c(1:3)] <- c("bin1", "bin2", "ANI")
df_pyani[, 1] <- as.character(df_pyani[, 1]); df_pyani[, 2] <- as.character(df_pyani[, 2])
# Fix factor levels so both axes follow the tree order (y-axis reversed so
# the diagonal runs top-left to bottom-right)
df_pyani$bin2 <- factor(df_pyani$bin2, levels = ord_list_bin)
df_pyani$bin1 <- factor(df_pyani$bin1, levels = rev(ord_list_bin))
# make plot
# Upper-triangle ANI heatmap: fill = ANI in %, cells annotated with the
# rounded value; colors squished into the 75-100% range.
p_ani <- ggplot(data = df_pyani,
aes(x = bin2, y = bin1, fill = 100*ANI))+
geom_raster()+
scale_fill_distiller(palette = "RdBu", name = "Average\nNucleotide\nIdentity (%)\n",
limits = 100*c(0.75,1.0), oob=squish) +
geom_text(aes(label = round(100*ANI, 0)), size = 4.5)+
xlab("")+
ylab("")+
scale_x_discrete(position = "top") +
theme(axis.title=element_text(size=14), strip.text.x=element_text(size=14),
legend.title=element_text(size=14), legend.text=element_text(size=14),
axis.text.y = element_blank(),
axis.text.x = element_text(size=14, angle = 55, hjust = 0),
title=element_text(size=20),
plot.margin = unit(c(1.1,1.1,1.1,1.1), "cm"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_rect(fill = "transparent",colour = NA),
plot.background = element_rect(fill = "transparent",colour = NA)
)
# png("ANI_heatmap.png", height= 9, width = 9, res = 500, units = "in", bg = "transparent")
print(p_ani) # dev.off()  -- uncomment together with the png() call above to export
# Import mothur OTU table + consensus taxonomy as a phyloseq object
df_phy <- import_mothur(mothur_shared_file = "./16S/stability.trim.contigs.good.unique.good.filter.unique.precluster.pick.pick.an.unique_list.shared",
mothur_constaxonomy_file = "./16S/stability.trim.contigs.good.unique.good.filter.unique.precluster.pick.pick.an.unique_list.0.03.cons.taxonomy")
# Filter out 2013 samples
# Keep only samples whose name contains a literal "." -- presumably the
# non-2013 naming scheme; confirm against the sample sheet.
df_phy <- prune_samples(grep(pattern = ".", sample_names(df_phy), fixed = TRUE,
value = TRUE), df_phy)
# Drop "cD" samples (invert = TRUE keeps everything NOT matching)
df_phy <- prune_samples(grep(pattern = "cD", sample_names(df_phy), fixed = TRUE,
value = TRUE, invert = TRUE), df_phy)
# Perform prevalence filtering
# Keep OTUs with count > 30 in more than 25% of the samples
df_phy <- filter_taxa(df_phy, function(x) sum(x > 30) > (0.25*length(x)), TRUE)
# Run spiec-easi
# Co-occurrence network via neighborhood selection (mb); StARS model
# selection over 20 lambdas with 50 subsamples.
sp_easi <- spiec.easi(df_phy, method='mb', lambda.min.ratio=1e-2,
nlambda=20, icov.select.params=list(rep.num=50))## Normalizing/clr transformation of data with pseudocount ...
## Inverse Covariance Estimation with mb ...
## Model selection with stars ...
## Done!
# Convert the selected adjacency matrix to an igraph object, naming
# vertices after the OTUs
ig.mb <- adj2igraph(sp_easi$refit, vertex.attr = list(name=taxa_names(df_phy)))
# Node size: max clr-transformed abundance per OTU across samples, +10 offset
vsize <- Biobase::rowMax(clr(t(otu_table(df_phy)), 1))+10
# Phylum-level lineage (taxonomy Rank2) used for node coloring
Lineage_rel <- tax_table(df_phy)[,"Rank2"]
Lineage_rel <- factor(Lineage_rel, levels = unique(Lineage_rel))
# OTUs that are Limnohabitans
# Rank6 == "betI_A" marks the Limnohabitans lineage in this taxonomy
limno_otus <- taxa_names(df_phy)[tax_table(df_phy)[,"Rank6"] == "betI_A"]
limno_otus <- limno_otus[!is.na(limno_otus)]
limno_labs <- c()
limno_labs[vertex.attributes(ig.mb)$name %in% limno_otus] <- "Limnohabitans sp."
limno_labs[is.na(limno_labs)] <- ""
# Plot network
# plot_network_custom is a project helper (not defined in this file).
# Nodes sized by max clr abundance (vsize), colored by phylum
# (Lineage_rel); Limnohabitans nodes get a repelled label with an arrow.
p_16S_network <- plot_network_custom(ig.mb, df_phy, type='taxa',
line_weight = 2, hjust = 0.5,
point_size = 0.1, alpha = 0.01, label=NULL, label_size = 3.95)+
scale_fill_brewer(palette = "Paired")+
scale_color_brewer(palette = "Paired")+
geom_point(aes(size = vsize, fill = Lineage_rel), alpha = 0.5,
colour="black", shape=21)+
guides(size = FALSE,
fill = guide_legend(title = "Phylum", override.aes = list(size = 5),
nrow = 4),
color = FALSE)+
theme(legend.position="bottom", legend.text=element_text(size=12),
text = element_text(size = 12),
plot.margin = unit(c(1,1,1,1), "cm"))+
scale_size(range = c(5, 15))+
geom_label_repel(aes(label = limno_labs), fontface = 'bold', color = 'black',
box.padding = 0.35, point.padding = 0.5,
segment.color = 'black',
size = 4,
# Width of the line segments.
segment.size = 1.5,
# Draw an arrow from the label to the data point.
arrow = arrow(length = unit(0.015, 'npc')),
nudge_x = -0.1,
nudge_y = 0.6
)
print(p_16S_network)
# Formula used to calculate relative abundances:
# \[Relative\ abundance = 100 \times \frac{mean\ coverage \times bin\ size}{read\ length \times total\ sample\ reads}\]
# Normalize for bin sizes
# Divide each relative abundance by the bin size in Mbp, giving an
# abundance per Mbp of genome that is comparable across bins of different
# size. (The group_by(bins) is retained from the original pipeline; the
# arithmetic itself is row-wise.)
data_total <- data_total %>% group_by(bins) %>%
mutate(norm_mean_rel_abundance = mean_rel_abundance/(bin_size/1e6),
norm_upper_rel_abundance = upper_rel_abundance/(bin_size/1e6),
norm_lower_rel_abundance = lower_rel_abundance/(bin_size/1e6)
)
# Plot abundance distributions of all bins
# Points + boxplots per bin; error bars span the +/- sd abundances;
# faceted by Season (rows) x Site (columns).
p_season1 <- ggplot(data = data_total, aes(x = bins, y = norm_mean_rel_abundance, fill = bins))+
geom_point(size = 4, shape = 21, alpha = 0.7)+
geom_boxplot(alpha = 0.3)+
scale_fill_brewer(palette = "Paired")+
theme_bw()+
geom_errorbar(aes(ymin=norm_lower_rel_abundance,
ymax=norm_upper_rel_abundance,
width=.1))+
facet_grid(Season~Site)+
# ylim(0,1)+
theme(axis.text=element_text(size=14), axis.title=element_text(size=20),
title=element_text(size=20), legend.text=element_text(size=14),
legend.background = element_rect(fill="transparent"),
axis.text.x = element_text(angle = 90, hjust = 1),
strip.text=element_text(size=18))+
ylab("Normalized relative abundance (%)")
p_season1
# (Read-mapping statistics below were generated with pileup.sh.)
# From: https://doi.org/10.1186/s13059-015-0834-7
# "The relative abundance of each MAG was estimated using the fraction of reads in each sample mapping to the respective MAG. Normalized on the size of that bin, this yielded the measure fraction of reads per nucleotide in bin. This measure was chosen since it is comparable across samples with varying sequencing output and different bin sizes. Using the CONCOCT input table, multiplying the average coverage per nucleotide with the length of the contig in question and summing over all contigs within a bin and within a sample gave the number of reads per bin within a sample. The fraction of reads in each sample mapping to each bin was then calculated by dividing this value with the total number of reads from each sample, after having removed duplicated reads."
# Generated from SAM file with pileup.sh from BBmap toolset
# Per-contig coverage/read counts; one column block per sample
df_map_reads <- read.delim("./anvio_output/collection_vizbin_pileup_cov.tsv",
stringsAsFactors = FALSE)
# Strip the "_cov.tsv" filename suffix so Sample holds the bare sample name
df_map_reads$Sample <- gsub(df_map_reads$Sample, pattern = "_cov.tsv",
replacement = "")
# This file was generated in anvio with anvi-export-collections
# Two columns: contig split name, bin assignment
lab_map_reads <- read.delim("./anvio_output/collection_vizbin.tsv",
header = FALSE)
colnames(lab_map_reads) <- c("contig_split", "Bin")
# Select Limnohabitans bins
lab_map_reads <- lab_map_reads[lab_map_reads$Bin %in% data_total$bins, ]
# Parent contig name = first two "_"-separated fields of the split name.
# Split the column once and reuse it (the original split the whole column
# twice, once per field).
contig_fields <- do.call(rbind, strsplit(as.character(lab_map_reads$contig_split), "_"))
lab_map_reads$contig <- paste(contig_fields[, 1], contig_fields[, 2], sep = "_")
# Keep only (Bin, contig) pairs, de-duplicated (several splits share a contig)
lab_map_reads <- lab_map_reads[ ,2:3] %>% distinct()
# Merge dataframes
df_map_merged <- left_join(lab_map_reads, df_map_reads, by = c("contig" = "X.ID")
)
# Coerce the per-sample count columns to numeric before aggregating
df_map_merged[, 4:13] <- apply(df_map_merged[, 4:13], 2, FUN = function(x) as.numeric(x))
# Total mapped reads per (bin, sample)
df_map_merged <- df_map_merged %>% group_by(Bin, Sample) %>% summarise(sum_map_read = sum(Plus_reads))
# Merge with metadata/bin sizes
# Harmonize sample names: drop replicate suffixes before joining
meta$Sample <- gsub(".C", "", meta$Sample, fixed = TRUE)
meta$Sample <- gsub(".A", "", meta$Sample, fixed = TRUE)
df_map_merged$Sample <- gsub(".C", "", df_map_merged$Sample, fixed = TRUE)
df_map_merged <- left_join(df_map_merged, bin_size, by = c("Bin" = "bins"))
df_map_merged <- left_join(df_map_merged, meta, "Sample")
# total_reads uses "_"-separated sample names; convert to "."-separated
total_reads$sample <- gsub("_", ".", fixed = TRUE,total_reads$sample)
df_map_merged <- left_join(df_map_merged, total_reads, c("Sample" = "sample"))
# Fraction of sample reads mapping to each bin (%)
df_map_merged <- df_map_merged %>% group_by(Bin, Sample) %>%
mutate(rel_abundance = 100*sum_map_read/Total_reads)
# Same, normalized by bin size in Mbp
df_map_merged <- df_map_merged %>% group_by(Bin, Sample) %>%
mutate(rel_norm_abundance = 100*sum_map_read/Total_reads/(bin_size/1e6))
# Rename bin names to more sensible names
# Add extra column with new bin names
# Columns 2 and 3 of the summary: old bin name ("bins") and new name
# ("new_bin_name") -- presumably; confirm against the file header.
new_bin_names <- read.table("./anvio_output/rebin/general_bins_summary_selected_final.tsv", header = TRUE)[, c(2,3)]
df_map_merged <- left_join(df_map_merged, new_bin_names, by = c("Bin" = "bins"))
df_map_merged$new_bin_name <- as.character(df_map_merged$new_bin_name)
# Fix MAG display order (MAG1..MAG10)
df_map_merged$new_bin_name <- factor(df_map_merged$new_bin_name, levels =
c("MAG1.FA-MLB-DN","MAG2.FA-MLB-SN",
"MAG3.FA-MLB-SN", "MAG4.FA-M110-DN",
"MAG5.SP-M110-DD","MAG6.SP-M15-SD",
"MAG7.SU-MLB-SD","MAG8.SU-M110-DCMD",
"MAG9.SU-M15-SN","MAG10.SU-M15-SN"))
# Human-readable site labels ("110" must be replaced before "15" would
# otherwise be ambiguous)
df_map_merged$Site <- as.character(df_map_merged$Site)
df_map_merged$Site <- gsub("110", "Lake Michigan\nsite M110", df_map_merged$Site)
df_map_merged$Site <- gsub("15", "Lake Michigan\nsite M15", df_map_merged$Site)
df_map_merged$Site <- gsub("Buoy", "Muskegon Lake", df_map_merged$Site)
df_map_merged$Site <- factor(df_map_merged$Site, levels = c("Muskegon Lake",
"Lake Michigan\nsite M15",
"Lake Michigan\nsite M110"))
df_map_merged$Season <- as.character(df_map_merged$Season)
df_map_merged$Season <- factor(df_map_merged$Season, levels = c("Spring", "Summer","Fall"))
# Make plots
# Remove non-limno bin
df_map_merged <- df_map_merged %>% dplyr::filter(new_bin_name != "MAG.noLIM")
# Jittered per-MAG normalized abundances, faceted Season x Site, sqrt y.
# NOTE(review): fill_palette and scaleFUN are defined elsewhere in the
# full document; the knitted output below records the error when
# fill_palette was missing.
p_abs2 <- ggplot(data = df_map_merged, aes(x = new_bin_name, y = rel_norm_abundance, fill = new_bin_name))+
theme_bw()+
scale_fill_manual("", values = fill_palette)+
geom_jitter(size = 4, shape = 21, color = "black", alpha = 0.7, width = 0.15)+
theme(axis.text=element_text(size=14), axis.title=element_text(size=20),
title=element_text(size=20), legend.text=element_text(size=12),
legend.background = element_rect(fill="transparent"),
axis.text.x = element_blank(),
strip.text=element_text(size=14), legend.position = "bottom",
strip.background = element_rect(fill = adjustcolor("gray", 0.15)))+
ylab(paste0("Norm. relative abundance (%)"))+
xlab("")+
guides(fill=guide_legend(nrow = 3))+
facet_grid(Season~Site, scales ="free")+
scale_y_continuous(labels=scaleFUN, limits = c(0,1.25))+
coord_trans(y = "sqrt")
p_abs2## Error in f(...): object 'fill_palette' not found
# First we need the files that map the gene ID to the sequence ID (linux cmd: https://github.com/rprops/MetaG_lakeMI/wiki/11.-Genome-annotation)
# These are stored in the IMG_annotation data for each genome bin
# Next, extract the %GC of each gene from the gff file
# IMG taxon id -> assembled gff id. This mapping drives one loop instead of
# twelve copy-pasted extract_gc_from_gff() calls; the loop creates the same
# Bin_<taxon id> objects the original calls did.
img_gff_ids <- c(
  "2737471681" = "133052", "2737471682" = "133053", "2737471683" = "133054",
  "2737471793" = "133647", "2737471794" = "133648", "2737471795" = "133649",
  "2737471797" = "133651", "2737471799" = "133653", "2737471802" = "133656",
  "2737471804" = "133658", "2737471805" = "133659", "2737471806" = "133660"
)
for (taxon_id in names(img_gff_ids)) {
  assign(
    paste0("Bin_", taxon_id),
    extract_gc_from_gff(
      sprintf("./IMG_annotation/IMG_%s/IMG_Data/%s.assembled.gff",
              taxon_id, img_gff_ids[[taxon_id]]),
      outputFolder = "GC_analysis"
    )
  )
}
# Use these files to make dataframes mapping function (COGs) and %GC
# IMG taxon id -> assembled gff id (declared locally so this chunk is
# self-contained). One data-driven loop replaces twelve copy-pasted
# gc2function() calls while still creating the same Bin_<id>_gc_cog
# objects the original produced.
gc_cog_ids <- c(
  "2737471681" = "133052", "2737471682" = "133053", "2737471683" = "133054",
  "2737471793" = "133647", "2737471794" = "133648", "2737471795" = "133649",
  "2737471797" = "133651", "2737471799" = "133653", "2737471802" = "133656",
  "2737471804" = "133658", "2737471805" = "133659", "2737471806" = "133660"
)
for (taxon_id in names(gc_cog_ids)) {
  img_dir <- sprintf("./IMG_annotation/IMG_%s/IMG_Data/%s", taxon_id, taxon_id)
  assign(
    paste0("Bin_", taxon_id, "_gc_cog"),
    gc2function(
      seq_id_gc = sprintf("GC_analysis/seqid_GC_%s.assembled.gff.tsv",
                          gc_cog_ids[[taxon_id]]),
      gene_id_seq_id = sprintf("%s/%s.gene_oid_2_seq_id.txt", img_dir, taxon_id),
      functions = sprintf("%s/%s.cog.tab.txt", img_dir, taxon_id),
      gc_thresh = 0.1, output = FALSE, label = paste0("Bin_", taxon_id)
    )
  )
}
# Stack the per-genome tables into one dataframe; unname() keeps rbind from
# prefixing row names (matching the original unnamed rbind() call).
merged_gc_cog <- do.call(rbind,
                         unname(mget(paste0("Bin_", names(gc_cog_ids), "_gc_cog"))))
merged_gc_cog$genome_id <- as.character(merged_gc_cog$genome_id)
# Get the COG ID to COG functional category mapping file here:
# ftp://ftp.ncbi.nih.gov/pub/wolf/COGs/COG0303/cogs.csv
# The exact statistical analysis to compare genomes based on these profiles
# should be performed in STAMP.
# Import COG mapping file
# cogs.csv has no header; keep only COG_ID and COG_class (col 3, the
# free-text function, is dropped)
cogid_2_cogcat <- read.csv("./Mapping_files/cogid_2_cogcat.csv", sep = ",", header = FALSE, fill = TRUE,col.names = c("COG_ID", "COG_class", "function"))[, 1:2]
# Drop rows with an empty COG class, then remove the now-unused levels
cogid_2_cogcat <- cogid_2_cogcat[(cogid_2_cogcat$COG_class)!="", ]
cogid_2_cogcat <- droplevels(cogid_2_cogcat)
# Read COG category file
cog_categories <- read.table("./Mapping_files/cog_categories.tsv", header = TRUE, sep = "\t")
# Merge COG metadata
cog_meta <- dplyr::left_join(cog_categories, cogid_2_cogcat, by = c("COG_class" = "COG_class"))
cog_meta <- droplevels(cog_meta)
# Merge this metadata with the genome data from before
# COGs with multiple classifications are currently still NA - work on this.
# (The knit output below recorded errors because merged_gc_cog failed to
# build in that run.)
merged_gc_cog <- dplyr::left_join(merged_gc_cog, cog_meta, by = c("cog_id" = "COG_ID"))## Error in dplyr::left_join(merged_gc_cog, cog_meta, by = c(cog_id = "COG_ID")): object 'merged_gc_cog' not found
merged_gc_cog <- merged_gc_cog[!is.na(merged_gc_cog$COG_functional_category),]## Error in eval(expr, envir, enclos): object 'merged_gc_cog' not found
# Visualize distribution across major metabolism functional COG groups per genome.
# Per-genome bar chart of gene counts by COG functional category, colored
# by functional cluster. The x-axis label now names the categorical
# variable that is actually plotted -- the previous "Gene length (bp)" was
# copy-pasted from a gene-length histogram.
p_cog_func_group <- ggplot(data = merged_gc_cog, aes(x=COG_functional_category, fill = COG_functional_cluster))+
geom_bar(stat="count", width=0.7, color = "black", size = 0.75)+
theme_bw()+
facet_grid(genome_id~.)+
scale_fill_brewer(palette = "Accent")+
labs(x = "COG functional category", y = "Count")+ theme(legend.position="bottom", axis.text.x = element_text(angle = 90, hjust = 1),
legend.text = element_text(size = 7))+
ggtitle("Limnohabitans MAGs")+
guides(fill=guide_legend(nrow=2,byrow=TRUE))
print(p_cog_func_group)
# Same chart keyed on the coarser functional cluster instead of category
p_cog_func_clust <- ggplot(data = merged_gc_cog, aes(x=COG_functional_cluster, fill = COG_functional_cluster))+
geom_bar(stat="count", width=0.7, color = "black", size = 0.75)+
theme_bw()+
facet_grid(genome_id~.)+
scale_fill_brewer(palette = "Accent")+
labs(x = "COG functional cluster", y = "Count")+ theme(legend.position="bottom",axis.text.x = element_text(angle = 90, hjust = 1),
legend.text = element_text(size = 7))+
ggtitle("Limnohabitans MAGs")+
guides(fill=guide_legend(nrow=2,byrow=TRUE))
print(p_cog_func_clust)
# One way: 1. Get the total nr. of reads through the samtools flagstat command.
#          2. Get gene lengths through the DESMAN command:
#             python $DESMAN/scripts/Lengths.py -i CDS.fa > CDS.len
# Chosen way: 1. Map reads with bwa
#             2. Extract mapping statistics with pileup.sh from BBtools
# Per-gene metatranscriptome mapping statistics from pileup.sh (BBtools)
expr_cov <- read.delim("./metaT/metaT_pileup.tsv", header = TRUE,
stringsAsFactors = FALSE)
# Merge gene annotations from all genomes in one file
# One IMG_* directory per genome under ./IMG_annotation
file_list <- list.files(path = "./IMG_annotation", recursive = FALSE,
pattern = "IMG", full.names = TRUE)
# merge_annotations is a project helper; the knit output below lists the
# ten genomes and the cumulative gene counts it reported.
merged_file <- merge_annotations(file_list[1:10], genoid_seqid = FALSE)## --- I will merge the annotation files from the following genomes:
## Genomes
## 1 ./IMG_annotation/IMG_2757320395
## 2 ./IMG_annotation/IMG_2757320396
## 3 ./IMG_annotation/IMG_2757320397
## 4 ./IMG_annotation/IMG_2757320398
## 5 ./IMG_annotation/IMG_2757320399
## 6 ./IMG_annotation/IMG_2757320400
## 7 ./IMG_annotation/IMG_2757320401
## 8 ./IMG_annotation/IMG_2757320402
## 9 ./IMG_annotation/IMG_2757320403
## 10 ./IMG_annotation/IMG_2757320404
## [1] 2322
## [1] 3720
## [1] 5259
## [1] 7208
## [1] 9745
## [1] 12405
## [1] 14858
## [1] 17644
## [1] 19813
## [1] 21848
## Wed Nov 29 12:07:07 2017 --- Sucessfully merged files
# Annotate this expression table with Kegg Orthology and genome ids
# Columns 1 and 10 of merged_file -- presumably gene_oid and Genome_id
# (Genome_id is used downstream); confirm against merge_annotations output.
expr_cov <- dplyr::left_join(expr_cov, merged_file[, c(1,10)], by = c("gene_oid"))
expr_cov$Plus_reads <- as.integer(expr_cov$Plus_reads)
expr_cov$Length <- as.integer(expr_cov$Length)
# Drop genes whose read count failed integer coercion
expr_cov <- expr_cov[!is.na(expr_cov$Plus_reads),]
# Total metatranscriptome reads per sample
sample_reads_metaT <- read.table("./metaT/sample_reads.tsv", header = TRUE)
# Move to long format dataframe for visualization in ggplot2
# expr_cov_long <- tidyr::gather(expr_cov, sample, Coverage, Fa13.BD.MLB.DN:Su13.BD.MM15.SN,
#                                factor_key = TRUE)
# Calculate average read recruitment to genes (for edgeR/DESeq) and also normalize to transcripts per million (TPM) just in case
# First read in gene lengths and total sample reads
# gene_lengths_metaT <- read.table("./metaT/genes_concat.len", header = TRUE)
# gene_lengths_metaT$gene <- as.character(gene_lengths_metaT$gene)
# Add this information to current long dataframe
expr_cov_long <- dplyr::left_join(expr_cov, sample_reads_metaT, by = c("Sample" = "sample"))
# expr_cov_long <- dplyr::left_join(expr_cov_long, gene_lengths_metaT, by = c("gene_oid" = "gene"))
# Now calculate average read recruitment and TPM (relative to the recruitment to these
# genomes in this dataset)
# 150 = read length
# expr_cov_long <- dplyr::mutate(expr_cov_long,
#                                mapped_reads = round((coverage * length)/(150),0)
#                                ) # Be aware that this average is already rounded here..
# RPK: reads per kilobase of gene, i.e. Plus_reads / (Length / 1000).
# The previous expression, Plus_reads/Length/1000, DIVIDED by 1000 instead
# of multiplying (off by a factor of 1e6 from reads-per-kb). The constant
# cancels between numerator and denominator of TPM below, so TPM values
# are unchanged by this fix -- only the reads_per_kb column itself is.
expr_cov_long <- dplyr::mutate(expr_cov_long,
                               reads_per_kb = Plus_reads / (Length / 1000)
)
# Per-sample scaling factor: total RPK over all genes, divided by 1e6
expr_cov_long <- expr_cov_long %>% group_by(Sample) %>%
  mutate(RPK_scaling = sum(reads_per_kb)/1e6
  )
# Transcripts per million, relative to reads recruited to these genomes
expr_cov_long <- expr_cov_long %>%
  mutate(TPM = reads_per_kb/RPK_scaling
  )
# Now add the metadata to this long dataframe
# Columns 1:11 of meta -- presumably the sample descriptors; confirm.
expr_cov_long <- left_join(expr_cov_long, meta[, 1:11], by = c("Sample"))
expr_cov_long$Genome_id <- as.factor(expr_cov_long$Genome_id)
# Remove duplicate rows
expr_cov_long <- expr_cov_long %>% distinct()
# Format data for DESeq2
## Put count matrices in list including a count matrix for each bin
colnames(expr_cov_long)[colnames(expr_cov_long) == "Plus_reads"] <- "mapped_reads"
# Build one genes-x-samples integer count matrix per genome, with gene_oid
# as row names, as DESeqDataSetFromMatrix expects.
expr_cov_bins <- list()
for(i in 1:nlevels(expr_cov_long$Genome_id)){
expr_cov_bins[[i]] <- expr_cov_long %>%
dplyr::filter(Genome_id == levels(expr_cov_long$Genome_id)[i]) %>%
dplyr::select(gene_oid, Sample, mapped_reads) %>%
tidyr::spread(Sample, mapped_reads)
r.bin <- expr_cov_bins[[i]]$gene_oid
expr_cov_bins[[i]] <- as.matrix(expr_cov_bins[[i]][, -1])
rownames(expr_cov_bins[[i]]) <- r.bin
}
# Metadata file
# Drop the first column and de-duplicate. Note ncol() -- the original used
# nrow() to index COLUMNS, which selects the wrong columns (or errors)
# whenever the metadata has more rows than columns.
meta_metaT <- distinct(meta[, 2:ncol(meta)])
# Row names must match the "."-separated sample names used as count-matrix
# column names
rownames(meta_metaT) <- gsub(meta_metaT$Sample_ID, pattern="_", replacement = ".")
meta_metaT <- as.matrix(meta_metaT)
# Check order of colnames in count and rownames in metadata matrix
# and make sure these are in the same order!
meta_metaT <- meta_metaT[match(colnames(expr_cov_bins[[1]]), rownames(meta_metaT)), ]
all(rownames(meta_metaT) %in% colnames(expr_cov_bins[[1]]))## [1] TRUE
# Report reads mapped for all genomes per sample
# Perform DESeq2 for differential abundance testing for each genome separately
## Season effect
# One DESeq2 run per genome: design ~ Site + Season tests the season
# effect while controlling for site. Results are stored sorted by adjusted
# p-value; summary() prints its report directly (it writes via cat, so it
# is visible even inside the loop).
General_deseq_results_season <- list()
for(i in 1:nlevels(expr_cov_long$Genome_id)){
cat(" --- Running DESeq2 on Genome_id:",levels(expr_cov_long$Genome_id)[i], "\n",sep = " ")
dds <- DESeqDataSetFromMatrix(countData = expr_cov_bins[[i]],
colData = meta_metaT,
design= ~ Site + Season) # Test for season but controlling for site
dds <- DESeq(dds)
General_deseq_results_season[[i]] <- results(dds)[order(results(dds)$padj), ] # specify contrasts here if need to
summary(results(dds, alpha = 0.05))
}## --- Running DESeq2 on Genome_id: 2757320395
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
##
## out of 2322 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 6, 0.26%
## LFC < 0 (down) : 11, 0.47%
## outliers [1] : 14, 0.6%
## low counts [2] : 0, 0%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
##
## --- Running DESeq2 on Genome_id: 2757320396
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
##
## out of 1398 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 6, 0.43%
## LFC < 0 (down) : 13, 0.93%
## outliers [1] : 10, 0.72%
## low counts [2] : 484, 35%
## (mean count < 17)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
##
## --- Running DESeq2 on Genome_id: 2757320397
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
##
## out of 1539 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 6, 0.39%
## LFC < 0 (down) : 7, 0.45%
## outliers [1] : 7, 0.45%
## low counts [2] : 0, 0%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
##
## --- Running DESeq2 on Genome_id: 2757320398
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
##
## out of 1949 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 6, 0.31%
## LFC < 0 (down) : 0, 0%
## outliers [1] : 7, 0.36%
## low counts [2] : 0, 0%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
##
## --- Running DESeq2 on Genome_id: 2757320399
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
##
## out of 2537 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 15, 0.59%
## LFC < 0 (down) : 14, 0.55%
## outliers [1] : 16, 0.63%
## low counts [2] : 147, 5.8%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
##
## --- Running DESeq2 on Genome_id: 2757320400
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
##
## out of 2660 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 5, 0.19%
## LFC < 0 (down) : 4, 0.15%
## outliers [1] : 17, 0.64%
## low counts [2] : 0, 0%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
##
## --- Running DESeq2 on Genome_id: 2757320401
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
##
## out of 2453 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 6, 0.24%
## LFC < 0 (down) : 16, 0.65%
## outliers [1] : 23, 0.94%
## low counts [2] : 141, 5.7%
## (mean count < 4)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
##
## --- Running DESeq2 on Genome_id: 2757320402
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
##
## out of 2786 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 20, 0.72%
## LFC < 0 (down) : 46, 1.7%
## outliers [1] : 26, 0.93%
## low counts [2] : 968, 35%
## (mean count < 16)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
##
## --- Running DESeq2 on Genome_id: 2757320403
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
##
## out of 2169 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 5, 0.23%
## LFC < 0 (down) : 6, 0.28%
## outliers [1] : 13, 0.6%
## low counts [2] : 0, 0%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
##
## --- Running DESeq2 on Genome_id: 2757320404
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
##
## out of 2035 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 2, 0.098%
## LFC < 0 (down) : 1, 0.049%
## outliers [1] : 10, 0.49%
## low counts [2] : 0, 0%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
## Site effect
# Run DESeq2 once per genome, testing the Site effect while controlling for
# Season (design ~ Season + Site). Results for genome i are stored ordered by
# adjusted p-value in General_deseq_results_site[[i]], indexed in the same
# order as levels(expr_cov_long$Genome_id).
General_deseq_results_site <- list()
for (i in seq_len(nlevels(expr_cov_long$Genome_id))) {
  cat(" --- Running DESeq2 on Genome_id:", levels(expr_cov_long$Genome_id)[i], "\n", sep = " ")
  dds <- DESeqDataSetFromMatrix(countData = expr_cov_bins[[i]],
                                colData = meta_metaT,
                                design = ~ Season + Site) # Test for site but controlling for season
  dds <- DESeq(dds)
  # Extract the results table once (the original called results(dds) twice,
  # recomputing it); specify contrasts here if needed.
  res <- results(dds)
  General_deseq_results_site[[i]] <- res[order(res$padj), ]
  # summary() prints via cat(), so it still displays from inside the loop
  summary(results(dds, alpha = 0.05))
}
## --- Running DESeq2 on Genome_id: 2757320395
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
##
## out of 2322 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 164, 7.1%
## LFC < 0 (down) : 159, 6.8%
## outliers [1] : 14, 0.6%
## low counts [2] : 90, 3.9%
## (mean count < 4)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
##
## --- Running DESeq2 on Genome_id: 2757320396
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
##
## out of 1398 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 78, 5.6%
## LFC < 0 (down) : 97, 6.9%
## outliers [1] : 10, 0.72%
## low counts [2] : 0, 0%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
##
## --- Running DESeq2 on Genome_id: 2757320397
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
##
## out of 1539 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 201, 13%
## LFC < 0 (down) : 190, 12%
## outliers [1] : 7, 0.45%
## low counts [2] : 0, 0%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
##
## --- Running DESeq2 on Genome_id: 2757320398
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
##
## out of 1949 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 240, 12%
## LFC < 0 (down) : 330, 17%
## outliers [1] : 7, 0.36%
## low counts [2] : 0, 0%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
##
## --- Running DESeq2 on Genome_id: 2757320399
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
##
## out of 2537 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 164, 6.5%
## LFC < 0 (down) : 248, 9.8%
## outliers [1] : 16, 0.63%
## low counts [2] : 49, 1.9%
## (mean count < 3)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
##
## --- Running DESeq2 on Genome_id: 2757320400
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
##
## out of 2660 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 101, 3.8%
## LFC < 0 (down) : 159, 6%
## outliers [1] : 17, 0.64%
## low counts [2] : 463, 17%
## (mean count < 10)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
##
## --- Running DESeq2 on Genome_id: 2757320401
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
##
## out of 2453 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 160, 6.5%
## LFC < 0 (down) : 182, 7.4%
## outliers [1] : 23, 0.94%
## low counts [2] : 48, 2%
## (mean count < 2)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
##
## --- Running DESeq2 on Genome_id: 2757320402
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
##
## out of 2786 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 125, 4.5%
## LFC < 0 (down) : 120, 4.3%
## outliers [1] : 26, 0.93%
## low counts [2] : 0, 0%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
##
## --- Running DESeq2 on Genome_id: 2757320403
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
##
## out of 2169 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 113, 5.2%
## LFC < 0 (down) : 172, 7.9%
## outliers [1] : 13, 0.6%
## low counts [2] : 85, 3.9%
## (mean count < 4)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
##
## --- Running DESeq2 on Genome_id: 2757320404
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
##
## out of 2035 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 105, 5.2%
## LFC < 0 (down) : 158, 7.8%
## outliers [1] : 10, 0.49%
## low counts [2] : 40, 2%
## (mean count < 3)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# Pool results for each genome into a single result dataframe.
# Helper: flatten a list of per-genome DESeqResults objects into one tidy data
# frame, tagging rows with the genome ID and the comparison tested. Uses the
# accessor functions (rownames(), $) rather than raw S4 slot access
# (@rownames/@listData), and binds all pieces at once instead of growing a
# data frame with rbind() inside a loop.
pool_deseq_results <- function(res_list, comparison) {
  dplyr::bind_rows(lapply(seq_along(res_list), function(i) {
    res <- res_list[[i]]
    data.frame(gene_oid = rownames(res),
               baseMean = res$baseMean,
               log2FoldChange = res$log2FoldChange,
               pvalue = res$pvalue,
               padj = res$padj,
               Genome_ID = levels(expr_cov_long$Genome_id)[i],
               Comparison = comparison)
  }))
}
res_deseq <- rbind(
  pool_deseq_results(General_deseq_results_season, "Season: Summer vs Fall"),
  pool_deseq_results(General_deseq_results_site, "Site: MLB vs M110")
)
# Only retain significantly differentially expressed genes
res_deseq <- res_deseq %>% dplyr::filter(padj < 0.05)
# Map IMG taxon IDs to the curated MAG names (columns 3 and 8:10 of the bin
# summary); IMG_taxID must be character to join against Genome_ID
new_bin_names2 <- read.table("./anvio_output/rebin/general_bins_summary_selected_final.tsv",
                             header = TRUE)[, c(3, 8:10)]
new_bin_names2$IMG_taxID <- as.character(new_bin_names2$IMG_taxID)
res_deseq <- left_join(res_deseq, new_bin_names2, by = c("Genome_ID" = "IMG_taxID"))
# Fix the MAG display order for plotting
res_deseq$new_bin_name <- as.character(res_deseq$new_bin_name)
res_deseq$new_bin_name <- factor(res_deseq$new_bin_name, levels =
                                   c("MAG1.FA-MLB-DN", "MAG2.FA-MLB-SN",
                                     "MAG3.FA-MLB-SN", "MAG4.FA-M110-DN",
                                     "MAG5.SP-M110-DD", "MAG6.SP-M15-SD",
                                     "MAG7.SU-MLB-SD", "MAG8.SU-M110-DCMD",
                                     "MAG9.SU-M15-SN", "MAG10.SU-M15-SN"))
# Plot for each genome the number of differentially expressed genes according to:
# season and site
# Bar chart: per-MAG counts of significantly DE genes, faceted by comparison
p_deseq_1 <- ggplot2::ggplot(res_deseq, aes(x = new_bin_name, fill = new_bin_name))+
  geom_bar(color = "black")+
  theme_bw()+
  scale_fill_brewer(palette = "Paired")+
  theme(axis.text.x = element_text(angle = 45, size = 14, hjust = 1),
        axis.text.y = element_text(size = 14),
        legend.title = element_blank(),
        axis.title = element_text(size = 14),
        strip.text = element_text(size = 14))+
  ylab("Counts")+
  xlab("")+
  facet_grid(~Comparison)
print(p_deseq_1)
# In order to check unspecific mapping, sample reads were mapped consecutively
# to every bin using BBMap (bbmap.sh). This approach allows us to check the
# mapping specificity by evaluating the distribution of read identity to the
# putative genome bin.
Competitive mapping was performed through blastn searches using 1M and 10M read subsamples (drawn with seqtk) from the interleaved fasta files generated after QC. The shell script used to achieve this:
#!/bin/bash
# Competitive read mapping: for every sample listed in map.list, draw a
# deterministic 1M-read subsample (seqtk, fixed seed 777 for reproducibility)
# from the QC'd interleaved fasta and megablast it against the merged-bins DB.
set -e
# The BLAST database path is the same for every sample: set it once, outside the loop.
db=/scratch/vdenef_fluxm/rprops/DESMAN/metaG/vizbin_rebin_anvio_v230/SEQ_discrete/contigs/merged_bins-fixed.db
# Read sample names line by line instead of word-splitting `cat` output,
# and quote every expansion so names with unusual characters stay intact.
while read -r file; do
  echo "${file}"
  seqtk sample -s 777 "/scratch/vdenef_fluxm/rprops/DESMAN/metaG/data/${file}/dt_int.fasta" 1000000 > "/scratch/vdenef_fluxm/rprops/DESMAN/metaG/data/${file}/dt_int_subs_1000000.fasta"
  qseqs="/scratch/vdenef_fluxm/rprops/DESMAN/metaG/data/${file}/dt_int_subs_1000000.fasta"
  blastn -query "${qseqs}" -task megablast -db "${db}" -out "./blast_output/${file}_blast.tsv" -outfmt 6 -max_target_seqs 1 -num_threads 40 -perc_identity 60
  # Remove the subsample to save scratch space
  rm "${qseqs}"
done < map.list
# blast with 1M reads of interleaved fasta
blast_df <- read.table("./SEQs_discrete/merged_blast_1M.tsv", header = FALSE)
colnames(blast_df) <- c("Sample", "Contig", "Identity")
# Map merged contig names back to original contig names and their bin
blast_df_map <- read.table("./SEQs_discrete/merged_contig_list.tsv", header = FALSE)
colnames(blast_df_map) <- c("m_contig", "o_contig", "bin")
# Round identity to one decimal and factorize so hits can be counted per level
blast_df$Identity <- round(blast_df$Identity, 1)
blast_df$Identity <- factor(blast_df$Identity)
blast_df <- left_join(blast_df, blast_df_map, by = c("Contig" = "m_contig"))
# Tally blast hits per sample / bin / identity level
blast_df_sum <- blast_df %>% group_by(Sample, bin) %>% count(Identity)
blast_df_sum$Sample <- gsub("_blast.tsv", "", blast_df_sum$Sample)
blast_df_sum <- data.frame(blast_df_sum)
blast_df_sum$Identity <- as.numeric(as.character(blast_df_sum$Identity))
# Strip the literal ".fa" extension (fixed = TRUE: without it the dot is a
# regex wildcard and would also match e.g. "Xfa")
blast_df_sum$bin <- gsub(".fa", "", blast_df_sum$bin, fixed = TRUE)
blast_df_sum <- dplyr::left_join(blast_df_sum, bin_size, by = c("bin" = "bins"))
# Normalize mapped reads per sample based on genome size (reads per Mbp)
blast_df_sum <- blast_df_sum %>% group_by(bin) %>%
  mutate(n_norm = 1e6*n/bin_size)
# Normalize mapped reads per sample based on sample reads
# blast_subs <- 1e6
# blast_df_sum <- blast_df_sum %>% ungroup %>% group_by(Sample) %>%
#   mutate(n_prop = 100*n/blast_subs)
# Add season variable, derived from the sample-name prefix (Fa/Su/Sp);
# "Summer" is the default and is reasserted explicitly for clarity
blast_df_sum$season <- "Summer"
blast_df_sum$season[grep("Fa", blast_df_sum$Sample)] <- "Fall"
blast_df_sum$season[grep("Su", blast_df_sum$Sample)] <- "Summer"
blast_df_sum$season[grep("Sp", blast_df_sum$Sample)] <- "Spring"
# Reformat sample names to match the metadata Sample_ID convention
blast_df_sum$Sample <- gsub(".C", "", blast_df_sum$Sample, fixed = TRUE)
blast_df_sum$Sample <- gsub(".A", "", blast_df_sum$Sample, fixed = TRUE)
blast_df_sum$Sample <- gsub(".", "_", blast_df_sum$Sample, fixed = TRUE)
# Add metadata to dataframe
meta_blast <- meta[, -1] %>% distinct()
blast_df_sum <- dplyr::left_join(blast_df_sum, meta_blast, by = c("Sample" = "Sample_ID"))
# Relabel and reorder the site factor for plotting
# (replace "110" before "15": the order matters for the substring matches)
blast_df_sum$Site <- as.character(blast_df_sum$Site)
blast_df_sum$Site <- gsub("Buoy", "Muskegon Lake", blast_df_sum$Site)
blast_df_sum$Site <- gsub("110", "Lake Michigan\nsite M110", blast_df_sum$Site)
blast_df_sum$Site <- gsub("15", "Lake Michigan\nsite M15", blast_df_sum$Site)
blast_df_sum$Site <- factor(blast_df_sum$Site, levels = c("Muskegon Lake",
                                                          "Lake Michigan\nsite M15",
                                                          "Lake Michigan\nsite M110"))
blast_df_sum$Depth <- as.character(blast_df_sum$Depth)
blast_df_sum$Depth <- factor(blast_df_sum$Depth, levels = c("Surface", "Mid", "Deep"))
blast_df_sum$season <- as.character(blast_df_sum$season)
blast_df_sum$season <- factor(blast_df_sum$season, levels = c("Spring", "Summer", "Fall"))
# Remove non-Limnohabitans bin
blast_df_sum <- blast_df_sum %>% dplyr::filter(bin != "B2_Fa13.BD.MLB.DN_rebin10")
# Add extra column with the new (MAG) bin names and fix their display order
new_bin_names <- read.table("./anvio_output/rebin/general_bins_summary_selected_final.tsv", header = TRUE)[, c(2,3)]
blast_df_sum <- left_join(blast_df_sum, new_bin_names, by = c("bin" = "bins"))
blast_df_sum$new_bin_name <- as.character(blast_df_sum$new_bin_name)
blast_df_sum$new_bin_name <- factor(blast_df_sum$new_bin_name, levels =
                                      c("MAG1.FA-MLB-DN", "MAG2.FA-MLB-SN",
                                        "MAG3.FA-MLB-SN", "MAG4.FA-M110-DN",
                                        "MAG5.SP-M110-DD", "MAG6.SP-M15-SD",
                                        "MAG7.SU-MLB-SD", "MAG8.SU-M110-DCMD",
                                        "MAG9.SU-M15-SN", "MAG10.SU-M15-SN"))
# plot for individual bins
# One figure per MAG: the distribution of read identity (x) vs genome-size-
# normalized read counts (y), one facet per sample, line color = season.
for(bin2plot in unique(blast_df_sum$new_bin_name)){
p_blast_sdisc <- blast_df_sum %>% dplyr::filter(new_bin_name == bin2plot) %>%
ggplot(aes(x = Identity, y = n_norm, color = season))+
theme_bw()+
scale_color_brewer(palette = "Accent")+
facet_wrap(~Sample, nrow = 4)+
geom_line(size = 1.5)+
# Legend suppressed: season is recoverable from the sample-name facets
guides(color = FALSE)+
ggtitle(bin2plot)+
theme(axis.text=element_text(size=14), axis.title=element_text(size=20),
title=element_text(size=20), legend.text=element_text(size=14),
legend.background = element_rect(fill="transparent"),
axis.text.x = element_text(angle = 45, hjust = 1),
strip.text.y=element_text(size=14))+
ylab("Reads per Mbp")+
xlab("Nucleotide identity (%)")+
# Only show the informative 75-100% identity range
xlim(75,100)
# print() is required for the plot to render inside the for loop
print(p_blast_sdisc)
}# Plot one combined figure with proportions normalized for genome size
# All MAGs overlaid in one figure: identity distribution per sample, with
# line color distinguishing the MAGs (Paired palette, shared legend below).
p_blast_sdisc_merged <- blast_df_sum %>%
ggplot(aes(x = Identity, y = n_norm, color = new_bin_name))+
theme_gray()+
scale_color_brewer("", palette = "Paired")+
facet_wrap(~Sample, nrow = 4)+
# Semi-transparent lines so overlapping MAG profiles remain readable
geom_line(size = 1.5, alpha = 0.7)+
ggtitle("Merged bins per sample")+
theme(axis.text=element_text(size=14), axis.title=element_text(size=20),
title=element_text(size=20), legend.text=element_text(size=14),
legend.background = element_rect(fill="transparent"),
axis.text.x = element_text(angle = 45, hjust = 1),
strip.text.y=element_text(size=14),
legend.position = "bottom")+
ylab("Reads per Mbp")+
xlab("Nucleotide identity (%)")+
xlim(75,100)+
guides(color=guide_legend(ncol=3))
print(p_blast_sdisc_merged)# Plot for most abundant bin (B63)
# Identity distribution for the most abundant bin (B63), season x site grid;
# point fill = season, point shape = sampling depth.
p_blast_sdisc_B63 <- blast_df_sum %>% dplyr::filter(bin == "B63_Su13.BD.MM110.DCMD_rebin1") %>%
  ggplot(aes(x = Identity, y = n_norm, fill = season, group = Sample,
             shape = Depth))+
  theme_bw()+
  facet_grid(season~Site)+
  geom_line(size = 2, color = adjustcolor("black",0.5))+
  geom_point(size = 3, alpha = 0.6)+
  # Shapes 21/22/24 have a fill aesthetic, so season fill shows inside them
  scale_shape_manual("", values = c(21,22,24))+
  scale_fill_brewer(palette = "Accent")+
  guides(color = FALSE, fill = FALSE)+
  # ggtitle(bin2plot)+
  theme(axis.text=element_text(size=14), axis.title=element_text(size=20),
        title=element_text(size=20), legend.text=element_text(size=14),
        legend.background = element_rect(fill="transparent"),
        axis.text.x = element_text(angle = 45, hjust = 1),
        strip.text=element_text(size=16),
        panel.grid.minor = element_blank(),
        legend.position = "bottom")+
  ylab("Reads per Mbp")+
  xlab("Nucleotide identity (%)")+
  xlim(75,100)
p_blast_sdisc_B63
# Add total read counts per sample (needed to express counts as percentages).
# NOTE: the original had this statement fused onto the line above, which
# silently assigned the join result to 'p_blast_sdisc_B63blast_df_sum'.
blast_df_sum <- left_join(blast_df_sum, total_reads, by = c("Sample" = "sample"))
# Express genome-size-normalized counts as a percentage of the fixed
# 1M-read blast census
blast_df_sum <- blast_df_sum %>% mutate(n_norm_perc = 100*n_norm/1e6)
# Custom fill palette: browns, pinks and blues drawn from three Brewer palettes
fill_palette <- c(rev(brewer.pal(11, "BrBG")[1:4]),
brewer.pal(11, "PiYG")[9:10],
brewer.pal(9, "PuBu")[4:7]
)
# Plot % reads corrected for genome size over threshold of 0.95
id_thresh <- 95
# Cumulative percentage of reads above the identity threshold; the final
# filter(Identity == 100) keeps only the row holding the full cumulative sum.
# NOTE(review): this assumes rows are ordered by increasing Identity within
# each Sample x bin group, and that an Identity == 100 row exists — verify.
map_disc_cum <- blast_df_sum %>% dplyr::filter(Identity > id_thresh) %>% group_by(Sample, bin) %>%
mutate(cum_rel_reads_mapped = cumsum(n_norm_perc))%>%
dplyr::filter(Identity == 100)
sum_cum <- map_disc_cum %>% group_by(Sample, bin) %>% mutate(cum_bins_rel_reads_mapped = sum(cum_rel_reads_mapped))
# Jittered per-MAG abundances, faceted season x site; sqrt-transformed y axis
p_sdisc_cum3 <- ggplot(map_disc_cum, aes(x = new_bin_name, y = cum_rel_reads_mapped,
fill = new_bin_name))+
theme_bw()+
scale_fill_manual("", values = fill_palette)+
geom_jitter(size = 4, shape = 21, color = "black", alpha = 0.7, width = 0.15)+
theme(axis.text=element_text(size=14), axis.title=element_text(size=20),
title=element_text(size=20), legend.text=element_text(size=12),
legend.background = element_rect(fill="transparent"),
axis.text.x = element_blank(),
strip.text=element_text(size=14), legend.position = "bottom",
strip.background = element_rect(fill = adjustcolor("gray", 0.15)))+
ylab(paste0("Norm. relative abundance ( > ", id_thresh, "% NI)"))+
xlab("")+
guides(fill=guide_legend(nrow = 3))+
facet_grid(season~Site, scales ="free")+
scale_y_continuous(labels=scaleFUN, limits = c(0,3))+
coord_trans(y = "sqrt")
print(p_sdisc_cum3)# Plot % reads over threshold of 0.99
id_thresh <- 99
# Same cumulative-abundance computation as above, with a 99% identity
# threshold. distinct() is applied first — presumably to drop duplicate rows
# introduced by an earlier join; confirm against the data.
# NOTE(review): as above, assumes rows are ordered by increasing Identity
# within each Sample x bin group.
map_disc_cum <- blast_df_sum %>% distinct() %>%
dplyr::filter(Identity > id_thresh) %>% group_by(Sample, bin) %>%
mutate(cum_rel_reads_mapped = cumsum(n_norm_perc))%>%
dplyr::filter(Identity == 100)
sum_cum <- map_disc_cum %>% group_by(Sample, bin) %>% mutate(cum_bins_rel_reads_mapped = sum(cum_rel_reads_mapped))
# Jittered per-MAG abundances at the stricter threshold, season x site facets
p_sdisc_cum4 <- ggplot(map_disc_cum, aes(x = new_bin_name, y = cum_rel_reads_mapped,
fill = new_bin_name))+
theme_bw()+
scale_fill_manual("", values = fill_palette)+
geom_jitter(size = 4, shape = 21, color = "black", alpha = 0.7, width = 0.15)+
theme(axis.text=element_text(size=14), axis.title=element_text(size=20),
title=element_text(size=20), legend.text=element_text(size=12),
legend.background = element_rect(fill="transparent"),
axis.text.x = element_blank(),
strip.text=element_text(size=14), legend.position = "bottom",
strip.background = element_rect(fill = adjustcolor("gray", 0.15)))+
ylab(paste0("Norm. relative abundance ( > ", id_thresh, "% NI)"))+
xlab("")+
guides(fill=guide_legend(nrow = 3))+
facet_grid(season~Site, scales ="free")+
scale_y_continuous(labels=scaleFUN, limits = c(0,1.25))+
coord_trans(y = "sqrt")
print(p_sdisc_cum4)# Compare blast inferred abundances with bwa inferred abundances
# Side-by-side comparison of bwa- and blastn-derived abundances (shared legend)
grid_arrange_shared_legend(p_abs2, p_sdisc_cum4, ncol = 2)# Merge those two dataframes and make scatter plot highlighting the differences
# Merge the bwa and blastn abundance tables and scatter-plot one against the other.
df_map_merged_sm <- df_map_merged[, c("Bin","Sample","rel_norm_abundance", "Season", "Site")]
df_map_merged_sm$Sample <- gsub(".", "_", df_map_merged_sm$Sample, fixed = TRUE)
# Build a Bin x Sample interaction key so the two tables can be joined on it
df_map_merged_sm <- data.frame(df_map_merged_sm,
                               interaction = interaction(df_map_merged_sm$Bin,
                                                         df_map_merged_sm$Sample),
                               method = "bwa")
map_disc_cum_sm <- map_disc_cum[, c("bin","Sample","cum_rel_reads_mapped")]
map_disc_cum_sm <- data.frame(map_disc_cum_sm,
                              interaction = interaction(map_disc_cum_sm$bin,
                                                        map_disc_cum_sm$Sample),
                              method = "blastn")
abs_merged <- left_join(df_map_merged_sm, map_disc_cum_sm, by = "interaction")
abs_merged <- abs_merged[, c("Bin","Sample.x","rel_norm_abundance","cum_rel_reads_mapped",
                             "Season", "Site")]
colnames(abs_merged) <- c("Bin","Sample",
                          "bwa_norm_abundance","blastn_norm_abundance",
                          "Season", "Site")
# Plot: bwa vs blastn normalized abundance per MAG, with an lm trend per facet
p_scat_abund <- ggplot(abs_merged, aes(x = bwa_norm_abundance, y = blastn_norm_abundance))+
  theme_bw()+
  geom_point(size = 4, shape = 21, color = "black", alpha = 0.7,
             aes(fill = Bin))+
  scale_fill_manual("", values = fill_palette)+
  theme(axis.text=element_text(size=14), axis.title=element_text(size=20),
        title=element_text(size=20), legend.text=element_text(size=12),
        legend.background = element_rect(fill="transparent"),
        strip.text=element_text(size=14), legend.position = "bottom",
        strip.background = element_rect(fill = adjustcolor("gray", 0.15)))+
  ylab(paste0("Norm. relative abundance (blastn, > ", id_thresh, "% NI)"))+
  xlab("Norm. relative abundance (BWA)")+
  guides(fill=guide_legend(nrow = 3))+
  facet_grid(Season~Site, scales ="free")+
  scale_y_continuous(labels=scaleFUN)+
  scale_x_continuous(labels=scaleFUN)+
  geom_smooth(method = "lm", color = "black")
# coord_trans(x = "atanh", y = "atanh")
print(p_scat_abund)
# Abundance table for the diversity analysis below (was fused onto the print()
# line in the original, which made the statement syntactically invalid)
abs_merged_div <- map_disc_cum[, c("Sample", "new_bin_name", "n_norm")]
# phyloseq's otu_table expects integer counts
abs_merged_div$n_norm <- as.integer(abs_merged_div$n_norm )
# Format long to wide format (MAGs as rows, samples as columns)
abs_table <- spread(abs_merged_div, Sample, n_norm)
abs_table <- data.frame(abs_table)
rownames(abs_table) <- abs_table[, 1]; abs_table <- abs_table[, -1]
# Minimal taxonomy table: each MAG is its own taxon
tax.table <- data.frame(MAGs = rownames(abs_table))
rownames(tax.table) <- tax.table$MAGs
# Make phyloseq object
MAG_phy <- phyloseq(otu_table(abs_table, taxa_are_rows = TRUE),
tax_table(as.matrix(tax.table)))
# Run Diversity_16S() — project-defined function; estimates diversity with
# R = 100 resamples on 3 parallel workers. NOTE(review): 'brea' is presumably
# a (partially matched) argument of Diversity_16S — confirm its meaning
# against the function's definition.
MAG_div <- Diversity_16S(MAG_phy, ncore = 3, parallel = TRUE,
R = 100, brea = FALSE)## **WARNING** this functions assumes that rows are samples and columns
## are taxa in your phyloseq object, please verify.
## Wed Nov 29 12:12:41 2017 Using 3 cores for calculations
## Wed Nov 29 12:12:41 2017 Calculating diversity for sample 1/24 --- Fa13_BD_MLB_DN
## Wed Nov 29 12:12:53 2017 Done with sample Fa13_BD_MLB_DN
## Wed Nov 29 12:12:53 2017 Calculating diversity for sample 2/24 --- Fa13_BD_MLB_SN
## Wed Nov 29 12:12:56 2017 Done with sample Fa13_BD_MLB_SN
## Wed Nov 29 12:12:56 2017 Calculating diversity for sample 3/24 --- Fa13_BD_MM110_DN
## Wed Nov 29 12:12:58 2017 Done with sample Fa13_BD_MM110_DN
## Wed Nov 29 12:12:58 2017 Calculating diversity for sample 4/24 --- Fa13_BD_MM110_SD
## Wed Nov 29 12:13:01 2017 Done with sample Fa13_BD_MM110_SD
## Wed Nov 29 12:13:02 2017 Calculating diversity for sample 5/24 --- Fa13_BD_MM110_SN
## Wed Nov 29 12:13:04 2017 Done with sample Fa13_BD_MM110_SN
## Wed Nov 29 12:13:04 2017 Calculating diversity for sample 6/24 --- Fa13_BD_MM15_DN
## Wed Nov 29 12:13:06 2017 Done with sample Fa13_BD_MM15_DN
## Wed Nov 29 12:13:06 2017 Calculating diversity for sample 7/24 --- Fa13_BD_MM15_SD
## Wed Nov 29 12:13:09 2017 Done with sample Fa13_BD_MM15_SD
## Wed Nov 29 12:13:09 2017 Calculating diversity for sample 8/24 --- Fa13_BD_MM15_SN
## Wed Nov 29 12:13:11 2017 Done with sample Fa13_BD_MM15_SN
## Wed Nov 29 12:13:11 2017 Calculating diversity for sample 9/24 --- Sp13_BD_MLB_SN
## Wed Nov 29 12:13:14 2017 Done with sample Sp13_BD_MLB_SN
## Wed Nov 29 12:13:14 2017 Calculating diversity for sample 10/24 --- Sp13_BD_MM110_DD
## Wed Nov 29 12:13:16 2017 Done with sample Sp13_BD_MM110_DD
## Wed Nov 29 12:13:16 2017 Calculating diversity for sample 11/24 --- Sp13_BD_MM110_SD
## Wed Nov 29 12:13:18 2017 Done with sample Sp13_BD_MM110_SD
## Wed Nov 29 12:13:18 2017 Calculating diversity for sample 12/24 --- Sp13_BD_MM110_SN
## Wed Nov 29 12:13:21 2017 Done with sample Sp13_BD_MM110_SN
## Wed Nov 29 12:13:21 2017 Calculating diversity for sample 13/24 --- Sp13_BD_MM15_DD
## Wed Nov 29 12:13:23 2017 Done with sample Sp13_BD_MM15_DD
## Wed Nov 29 12:13:23 2017 Calculating diversity for sample 14/24 --- Sp13_BD_MM15_SD
## Wed Nov 29 12:13:25 2017 Done with sample Sp13_BD_MM15_SD
## Wed Nov 29 12:13:25 2017 Calculating diversity for sample 15/24 --- Sp13_BD_MM15_SN
## Wed Nov 29 12:13:28 2017 Done with sample Sp13_BD_MM15_SN
## Wed Nov 29 12:13:28 2017 Calculating diversity for sample 16/24 --- Su13_BD_MLB_DD
## Wed Nov 29 12:13:30 2017 Done with sample Su13_BD_MLB_DD
## Wed Nov 29 12:13:30 2017 Calculating diversity for sample 17/24 --- Su13_BD_MLB_SD
## Wed Nov 29 12:13:32 2017 Done with sample Su13_BD_MLB_SD
## Wed Nov 29 12:13:32 2017 Calculating diversity for sample 18/24 --- Su13_BD_MM110_DCMD
## Wed Nov 29 12:13:34 2017 Done with sample Su13_BD_MM110_DCMD
## Wed Nov 29 12:13:34 2017 Calculating diversity for sample 19/24 --- Su13_BD_MM110_DN
## Wed Nov 29 12:13:37 2017 Done with sample Su13_BD_MM110_DN
## Wed Nov 29 12:13:37 2017 Calculating diversity for sample 20/24 --- Su13_BD_MM110_SD
## Wed Nov 29 12:13:39 2017 Done with sample Su13_BD_MM110_SD
## Wed Nov 29 12:13:39 2017 Calculating diversity for sample 21/24 --- Su13_BD_MM110_SN
## Wed Nov 29 12:13:41 2017 Done with sample Su13_BD_MM110_SN
## Wed Nov 29 12:13:41 2017 Calculating diversity for sample 22/24 --- Su13_BD_MM15_DN
## Wed Nov 29 12:13:43 2017 Done with sample Su13_BD_MM15_DN
## Wed Nov 29 12:13:43 2017 Calculating diversity for sample 23/24 --- Su13_BD_MM15_SD
## Wed Nov 29 12:13:45 2017 Done with sample Su13_BD_MM15_SD
## Wed Nov 29 12:13:45 2017 Calculating diversity for sample 24/24 --- Su13_BD_MM15_SN
## Wed Nov 29 12:13:47 2017 Done with sample Su13_BD_MM15_SN
## Wed Nov 29 12:13:47 2017 Closing workers
## Wed Nov 29 12:13:47 2017 Done with all 24 samples
# Keep sample names plus the diversity columns of interest (drop columns 3:6)
MAG_div <- data.frame(Sample = rownames(MAG_div), MAG_div[,-c(3:6)])
# Merge with metadata
MAG_div <- left_join(MAG_div, meta_blast, by = c("Sample" = "Sample_ID"))
# Annotate and order metavariables (same relabeling as for blast_df_sum above)
MAG_div$Site <- as.character(MAG_div$Site)
MAG_div$Site <- gsub("Buoy","Muskegon Lake", MAG_div$Site)
MAG_div$Site <- gsub("110","Lake Michigan\nsite M110", MAG_div$Site)
MAG_div$Site <- gsub("15","Lake Michigan\nsite M15", MAG_div$Site)
MAG_div$Site <- factor(MAG_div$Site, levels = c("Muskegon Lake",
"Lake Michigan\nsite M15",
"Lake Michigan\nsite M110"))
MAG_div$Depth <- as.character(MAG_div$Depth)
MAG_div$Depth <- factor(MAG_div$Depth, levels = c("Surface", "Mid", "Deep"))
MAG_div$Season <- as.character(MAG_div$Season)
MAG_div$Season <- factor(MAG_div$Season, levels = c("Spring", "Summer", "Fall"))
# Plot results: inverse-Simpson (D2) diversity per season, faceted by site,
# with error bars from the resampling standard deviation (sd.D2)
p_MAG_div <- ggplot(MAG_div, aes(x = Season, y = D2,
fill = Season, shape = Depth))+
theme_bw()+
scale_fill_brewer(palette = "Accent")+
geom_point(size = 4, color = "black", alpha = 0.7)+
# Fillable shapes so the season fill shows inside the depth symbols
scale_shape_manual(values = c(21,24,23))+
# geom_boxplot(alpha = 0.4)+
theme(axis.text=element_text(size=14), axis.title=element_text(size=20),
title=element_text(size=20), legend.text=element_text(size=12),
legend.background = element_rect(fill="transparent"),
axis.text.x = element_text(size = 14, angle = 45, hjust = 1),
strip.text=element_text(size=14), legend.position = "bottom",
strip.background = element_rect(fill = adjustcolor("gray", 0.15)))+
ylab(paste0("Limnohabitans population\n diversity (D2)"))+
guides(fill=FALSE)+
facet_grid(~Site, scales ="free")+
xlab("")+
geom_errorbar(aes(ymin = D2 - sd.D2, ymax = D2 + sd.D2), width = 0.05)+
scale_y_continuous(labels=scaleFUN, limits = c(0,6.5))
# coord_trans(y = "sqrt")
print(p_MAG_div)